import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
#dataload
BRCA1_raw = pd.read_csv("Resources/BRCA1_HUMAN_Findlay_2018.csv")
#code to split our dataframe into its domains
RING = BRCA1_raw[:575]
BRCT = BRCA1_raw[575:]
# add column named: 'position'
df=BRCA1_raw.copy()
import re
def get_number_from_text(text):
return re.findall(r'\d{1,4}',text)[0]
df['position']=df['mutant'].apply(get_number_from_text)
# add column containing the newly introduced Aminoacid
test1=df.mutant.str.extract(r'(\d)([ARNDCQEGHILKTMfPOUSTWYV])')
df["new_aa"]=test1[1]
# dataset with only bin0s
df_filtered = df[df['DMS_score_bin'] == 0]
#CHECKING FOR SECONDARY STRUCTURES -> based on Psipred
#Helices:
helices = ['8', '9', '10',"11","12","13","14","15","16","17","18","19","20","21","22","28","29","30","31","41","42","43","44","45","46","47","48","49","50","51","52","70","71","72","73","78","79","80","81","82","83","84","85","86","87","88","89","90","91","92","93","94","95","96","97","1659","1660","1661","1662","1663","1664","1665","1666","1667","1668","1669","1670","1671","1701","1702","1703","1704","1705","1706","1707","1708","1716","1717","1718","1719","1720","1721","1722","1723","1724","1725","1749","1750","1751","1752","1753","1754","1755","1777","1778","1779","1780","1781","1782","1783","1784","1785","1786","1820","1821","1822","1823","1824","1825","1826","1827","1835","1836","1837","1838","1839","1840","1841","1842","1843","1844"]
helices_df = df.loc[df['position'].astype(str).isin(helices)]
#Helices_bin0:
helices_df_bin0 = helices_df.loc[helices_df['DMS_score_bin'] == 0].copy()
#Strands:
strands = ["1650","1651","1652","1653","1654","1674","1675","1676","1677","1684","1685","1686","1687","1688","1689","1712","1713","1714","1715","1765","1766","1767","1768","1769","1807","1808","1809","1810"]
strands_df = df.loc[df['position'].astype(str).isin(strands)]
#Strands_bin0:
strands_df_bin0 = strands_df.loc[strands_df['DMS_score_bin'] == 0].copy()
#Coils (improved technique :D):
df['position'] = df['position'].astype(int)
coils = list(range(1, 8)) + list(range(22, 35)) + list(range(37, 45))+ list(range(55, 67)) + list(range(69, 77)) + list(range(98, 102)) + list(range(1631, 1651))+ list(range(1656, 1660)) + list(range(1673, 1676)) + list(range(1679, 1686)) + list(range(1691, 1702)) + list(range(1710, 1715))+ list(range(1727, 1736))+ list(range(1738, 1753))+ list(range(1757, 1767))+ list(range(1771, 1778))+ list(range(1788, 1791))+ list(range(1794, 1796))+ list(range(1799, 1806))+ list(range(1812, 1821))+ list(range(1829, 1833))+ list(range(1846, 1851))+ list(range(1854, 1865))
coils_df = df.loc[df['position'].isin(coils)]
#Coils_bin0:
coils_df_bin0 = coils_df.loc[coils_df['DMS_score_bin'] == 0].copy()
# Testing for overlaps:
set1 = set(helices)
set2 = set(strands)
set3 = set(coils)
common_elements = set1.intersection(set2, set3)
common_elements_list = list(common_elements)
common_elements_list
#-> no overlaps, yay
[]
#RING:
#Helices in RING
helices_RING = ['8', '9', '10',"11","12","13","14","15","16","17","18","19","20","21","22","28","29","30","31","41","42","43","44","45","46","47","48","49","50","51","52","70","71","72","73","78","79","80","81","82","83","84","85","86","87","88","89","90","91","92","93","94","95","96","97"]
helices_df_RING = df.loc[df['position'].astype(str).isin(helices_RING)]
helices_df_RING_bin0 = helices_df_RING.loc[helices_df_RING['DMS_score_bin'] == 0].copy()
#strands in RING (there are none)
#coils in RING
df['position'] = df['position'].astype(int)
coils_RING = list(range(1, 8)) + list(range(22, 35)) + list(range(37, 45))+ list(range(55, 67)) + list(range(69, 77)) + list(range(98, 102))
coils_df_RING = df.loc[df['position'].isin(coils_RING)]
coils_df_RING_bin0 = coils_df_RING.loc[coils_df_RING['DMS_score_bin'] == 0].copy()
#Barplot + ratios:
df_counts = [len(helices_df_RING),len(helices_df_RING_bin0),0,0, len(coils_df_RING),len(coils_df_RING_bin0)]
df_names = ['α-helices',"h_bin0", 'β-strands',"s_bin0",'coils',"c_bin0"]
colors = ['crimson', 'crimson', 'indigo', 'indigo', 'dimgrey', 'dimgrey']
plt.bar(df_names, df_counts,color=colors)
plt.xlabel('Secondary structures')
plt.ylabel('Amount of SNVs')
plt.title('SNVs per secondary structure type: RING')
plt.show()
dataframes = [helices_df_RING, helices_df_RING_bin0, coils_df_RING, coils_df_RING_bin0]
df_names = ['helices_R', 'helices_bin0_R', 'coils_R', 'coils_bin0_R']
ratios = []
for i in range(1, len(dataframes), 2):
ratio = (len(dataframes[i]) / len(dataframes[i-1])) * 100
ratios.append(ratio)
for i, ratio in enumerate(ratios):
print(f"ratio: {df_names[i*2+1]} to {df_names[i*2]}: {ratio:.2f}%")
ratio: helices_bin0_R to helices_R: 30.77% ratio: coils_bin0_R to coils_R: 29.58%
#Chi2, RING:
#H0: no correlation
len_hRING=len(helices_df_RING)
len_hRING0=len(helices_df_RING_bin0)
len_cRING=len(coils_df_RING)
len_cRING0=len(coils_df_RING_bin0)
observed_data_RING = np.array([[len_cRING, len_hRING],[len_cRING0, len_hRING0]])
chi2, p_value, dof, expected = chi2_contingency(observed_data_RING)
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p_value}")
print(f"Degrees of freedom: {dof}")
print("Expected counts:")
print(expected)
Chi-square statistic: 0.022393889272595308 P-value: 0.8810440090537792 Degrees of freedom: 1 Expected counts: [[282.61286255 326.38713745] [ 85.38713745 98.61286255]]
Since the P-value is >0.05 there is no strong indication to reject the H0 hypothesis. So in this domain the chances of a SNV leading to a bin_0 score is the same no matter what secondary structure.
#Vizualizing secondary structures in RING
RING = df[:575]
new_RING = RING.copy()
new_RING['secondary structures'] = RING['position'].apply(lambda x: 'H' if str(x) in helices_RING else 'C')
positions = new_RING['position']
structures = new_RING['secondary structures']
colors = ['crimson' if structure == 'H' else 'dimgrey' for structure in structures]
# scatter plot:
plt.scatter(positions, structures, c=colors)
plt.xticks(positions, rotation='vertical')
plt.xlabel('Position', fontsize=15)
plt.ylabel('Secondary Structures', fontsize=15)
plt.title('Position vs. Secondary Structure - RING', fontsize=20)
fig = plt.gcf()
fig.set_size_inches(20, 3)
plt.gca().yaxis.set_tick_params(labelsize=10)
plt.show()
#secondary structure + dms score_RING:
new_RING_agg = new_RING.pivot_table(index='secondary structures', columns='mutant', values='DMS_score')
# Get the order of mutants from the original DataFrame
mutant_order = new_RING['mutant'].unique()
# Reindex the columns of the pivot table to match the desired order
new_RING_agg = new_RING_agg.reindex(index=new_RING_agg.index[::-1],columns=mutant_order)
# Plot creation
plt.figure(figsize=(150, 30))
# heatmap
from matplotlib.colors import ListedColormap
colors = ['indianred', 'royalblue']
cmap = ListedColormap(colors)
heatmap=sns.heatmap(new_RING_agg, cmap=cmap,linewidths=0.5, fmt=".2f",cbar_kws={"label":"DMS_scores"})
plt.xlabel("Mutant", fontsize=100)
plt.ylabel("Secondary Structures", fontsize=100)
heatmap.set_yticklabels(heatmap.get_yticklabels(), fontsize=70)
cbar = heatmap.collections[0].colorbar
cbar.ax.tick_params(labelsize=70)
cbar.ax.set_ylabel('DMS_score_bin', fontsize=100)
data_min = new_RING_agg.min().min()
data_max = new_RING_agg.max().max()
cbar.set_ticks([data_min, data_max])
cbar.set_ticklabels([0, 1])
plt.show()
#BRCT:
#Helices in BRCT
helices_BRCT = ["1659","1660","1661","1662","1663","1664","1665","1666","1667","1668","1669","1670","1671","1701","1702","1703","1704","1705","1706","1707","1708","1716","1717","1718","1719","1720","1721","1722","1723","1724","1725","1749","1750","1751","1752","1753","1754","1755","1777","1778","1779","1780","1781","1782","1783","1784","1785","1786","1820","1821","1822","1823","1824","1825","1826","1827","1835","1836","1837","1838","1839","1840","1841","1842","1843","1844"]
helices_df_BRCT = df.loc[df['position'].astype(str).isin(helices_BRCT)]
helices_df_BRCT_bin0 = helices_df_BRCT.loc[helices_df_BRCT['DMS_score_bin'] == 0].copy()
#strands in BRCT
strands_BRCT = ["1650","1651","1652","1653","1654","1674","1675","1676","1677","1684","1685","1686","1687","1688","1689","1712","1713","1714","1715","1765","1766","1767","1768","1769","1807","1808","1809","1810"]
strands_df_BRCT = df.loc[df['position'].astype(str).isin(strands_BRCT)]
strands_df_BRCT_bin0 = strands_df_BRCT.loc[strands_df_BRCT['DMS_score_bin'] == 0].copy()
#coils in BRCT
df['position'] = df['position'].astype(int)
coils_BRCT = list(range(1631, 1651))+ list(range(1656, 1660)) + list(range(1673, 1676)) + list(range(1679, 1686)) + list(range(1691, 1702)) + list(range(1710, 1715))+ list(range(1727, 1736))+ list(range(1738, 1753))+ list(range(1757, 1767))+ list(range(1771, 1778))+ list(range(1788, 1791))+ list(range(1794, 1796))+ list(range(1799, 1806))+ list(range(1812, 1821))+ list(range(1829, 1833))+ list(range(1846, 1851))+ list(range(1854, 1865))
coils_df_BRCT = df.loc[df['position'].isin(coils_BRCT)]
coils_df_BRCT_bin0 = coils_df_BRCT.loc[coils_df_BRCT['DMS_score_bin'] == 0].copy()
#Barplot+ratios:
df_counts = [len(helices_df_BRCT),len(helices_df_BRCT_bin0),len(strands_df_BRCT),len(strands_df_BRCT_bin0), len(coils_df_BRCT),len(coils_df_BRCT_bin0)]
df_names = ['α-helices',"h_bin0", 'β-strands',"s_bin0",'coils',"c_bin0"]
colors = ['crimson', 'crimson', 'indigo', 'indigo', 'dimgrey', 'dimgrey']
plt.bar(df_names, df_counts,color=colors)
plt.xlabel('Secondary structures')
plt.ylabel('Amount of SNVs')
plt.title('SNVs per secondary structure type: BRCT')
plt.show()
dataframes = [helices_df_BRCT, helices_df_BRCT_bin0, strands_df_BRCT,strands_df_BRCT_bin0,coils_df_BRCT, coils_df_BRCT_bin0]
df_names = ['helices_B', 'h_bin0_B',"strands_B","s_bin0_B", 'coils_B', 'coils_bin0_B']
ratios = []
for i in range(1, len(dataframes), 2):
ratio = (len(dataframes[i]) / len(dataframes[i-1])) * 100
ratios.append(ratio)
for i, ratio in enumerate(ratios):
print(f"ratio: {df_names[i*2+1]} to {df_names[i*2]}: {ratio:.2f}%")
ratio: h_bin0_B to helices_B: 27.45% ratio: s_bin0_B to strands_B: 40.54% ratio: coils_bin0_B to coils_B: 21.57%
#Chi2, BRCT:
#H0: no correlation
len_hBRCT=len(helices_df_BRCT)
len_hBRCT0=len(helices_df_BRCT_bin0)
len_cBRCT=len(coils_df_BRCT)
len_cBRCT0=len(coils_df_BRCT_bin0)
len_sBRCT=len(strands_df_BRCT)
len_sBRCT0=len(strands_df_BRCT_bin0)
observed_data_BRCT = np.array([[len_cBRCT, len_hRING, len_sBRCT],[len_cBRCT0, len_hBRCT0, len_sBRCT0]])
chi2, p_value, dof, expected = chi2_contingency(observed_data_BRCT)
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p_value}")
print(f"Degrees of freedom: {dof}")
print("Expected counts:")
print(expected)
Chi-square statistic: 15.036107804784002 P-value: 0.0005431886360097251 Degrees of freedom: 2 Expected counts: [[672.2040404 336.4969697 164.2989899] [178.7959596 89.5030303 43.7010101]]
Since the P-value is <0.05 there is a strong indication to reject the H0 hypothesis. So in this domain the chances of a SNV leading to a bin_0 score is different for each secondary structure.
BRCT = df[575:]
new_BRCT = BRCT.copy()
new_BRCT['secondary structures'] = BRCT['position'].apply(lambda x: 'S' if str(x) in strands_BRCT else ('H' if str(x) in helices_BRCT else 'C'))
positions = new_BRCT['position']
structures = new_BRCT['secondary structures']
colors = ['crimson' if structure == 'H' else 'dimgrey' if structure == "C" else "indigo" for structure in structures]
# scatter plot:
plt.scatter(positions, structures, c=colors)
# Set the tick positions and labels on the X-axis
tick_positions = np.arange(min(positions) -1, max(positions) + 1, 10)
plt.xticks(tick_positions, rotation='vertical')
plt.xlabel('Position', fontsize=30)
plt.ylabel('Secondary Structures', fontsize=30)
plt.title('Position vs. Secondary Structure - BRCT', fontsize=35)
fig = plt.gcf()
fig.set_size_inches(40, 6)
plt.gca().yaxis.set_tick_params(labelsize=20)
plt.show()
#secondary structures + dms score_BRCT:
# Remove duplicates or aggregate the data
new_BRCT_agg = new_BRCT.pivot_table(index='secondary structures', columns='mutant', values='DMS_score_bin')
# Get the order of mutants from the original DataFrame
mutant_order1 = new_BRCT['mutant'].unique()
# Reindex the columns of the pivot table to match the desired order
new_BRCT_agg = new_BRCT_agg.reindex(index=new_BRCT_agg.index[::-1],columns=mutant_order1)
structure_order = new_BRCT_agg.index.tolist()
structure_order[0], structure_order[1] = structure_order[1], structure_order[0]
new_BRCT_agg = new_BRCT_agg.reindex(index=structure_order)
# Plot creation
from matplotlib.colors import ListedColormap
colors = ['indianred', 'royalblue']
cmap = ListedColormap(colors)
plt.figure(figsize=(350, 50))
heatmap=sns.heatmap(new_BRCT_agg, cmap=cmap,linewidths=0.5, fmt=".2f",cbar_kws={"label":"DMS_score_bin"})
plt.xlabel("Mutant", fontsize=150)
plt.ylabel("Secondary Structures",fontsize=150)
heatmap.set_yticklabels(heatmap.get_yticklabels(), fontsize=120)
cbar = heatmap.collections[0].colorbar
cbar.ax.tick_params(labelsize=100)
cbar.ax.set_ylabel('DMS_score_bin', fontsize=150)
cbar.set_ticks([0.25, 0.75])
cbar.set_ticklabels([0, 1])
plt.show()
#importing the data from the msa
fasta_file = {}
key = None
value = ['']
with open(r"Resources/aln-fasta_final.txt",'r') as f:
for line in f:
if line.startswith('>'):
fasta_file[key] = "".join(value)
reading=True
key = line.strip()
value = []
continue
else:
value.append(line.strip())
#create sequence logo with full sequence
def create_sequence_logo(sequence_data, split_positions=None, columns=1, figsize=(16, 8), scaling_factor=0.04):
amino_acids_to_plot = ['-',"A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V",'X']
# Filter out empty sequences
sequence_data = {k: v for k, v in sequence_data.items() if v}
# Calculate the maximum length of the sequences
max_length = max(len(seq) for seq in sequence_data.values())
# Create a dictionary to store the frequency of each amino acid at each position
amino_acid_frequencies = {aa: [0] * max_length for aa in amino_acids_to_plot}
# Iterate over the sequences and update the amino acid frequencies
for sequence in sequence_data.values():
for i, aa in enumerate(sequence):
amino_acid_frequencies[aa][i] += 1
# Determine the splitting positions
if split_positions is None:
split_positions = [0, max_length]
# Adjust the number of columns if needed
num_plots = len(split_positions) - 1
rows = num_plots // columns + (num_plots % columns > 0)
# Create subplots with increased vertical gap
fig, axs = plt.subplots(rows, columns, figsize=figsize, squeeze=False, gridspec_kw={'hspace': 0.5})
axs = axs.ravel()
# Plotting the amino acid frequencies by position for each subset
term = False
for i, ax in enumerate(axs):
start = split_positions[i]
if i == num_plots-1: # Handle the last subset separately
print('end')
end = max_length
term = True
else:
end = split_positions[i + 1]
subset_data = {k: v[start:end] for k, v in sequence_data.items()}
# Plotting the amino acid frequencies by position for each amino acid
for aa in amino_acids_to_plot:
frequencies = amino_acid_frequencies[aa][start:end]
if any(frequencies):
ax.plot(range(1, len(frequencies) + 1), [f * scaling_factor for f in frequencies], label=aa)
ax.set_xlim(1, end - start + 1)
ax.set_xticks(range(1, end - start + 1))
ax.set_xticklabels(range(start + 1, end + 1))
ax.set_xlabel('Position')
ax.set_ylabel('Frequency')
ax.set_title(f"Sequence Logo - Position {start+1}-{end}")
if term:
break
# Create a single legend outside the loop
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='center', ncol=1)
# fig.legend(handles, labels, loc=(1.04, 0))
# plt.tight_layout()
plt.show()
split_positions = [0, max(len(seq) for seq in fasta_file.values())] # Manually specify splitting positions
create_sequence_logo(fasta_file, split_positions=split_positions, columns=2, figsize=(150, 10), scaling_factor=0.011)
end
#create a consensus sequence
def create_consensus_sequence(sequence_data):
# Filter out empty sequences
sequence_data = {k: v for k, v in sequence_data.items() if v}
# Calculate the maximum length of the sequences
max_length = max(len(seq) for seq in sequence_data.values())
# Create a dictionary to store the frequency of each amino acid at each position
amino_acid_frequencies = {aa: [0] * max_length for aa in set(''.join(sequence_data.values()))}
# Iterate over the sequences and update the amino acid frequencies
for sequence in sequence_data.values():
for i, aa in enumerate(sequence):
amino_acid_frequencies[aa][i] += 1
# Create the consensus sequence
consensus_sequence = ''
for i in range(max_length):
max_frequency = 0
max_amino_acid = ''
for aa, frequencies in amino_acid_frequencies.items():
frequency = frequencies[i]
if frequency > max_frequency:
max_frequency = frequency
max_amino_acid = aa
consensus_sequence += max_amino_acid
return consensus_sequence
consensus_sequence = create_consensus_sequence(fasta_file)
consensus_sequence
'---------------------------------------------------------------------------------------------------------------------MDLSADRVEEVQNVLNAMQK---ILECPICLELIKEPVSTKCD-HIFCK---------FCMLKLLNQKKGPSQCPLCKNDITKRSLQESTRFSQLVEELLKIIHAFELD---TGLQFAN-SYNFSKK-ENNSPEHLK-EEVSIIQSMGYRNRAKRLRQSEPENPTL--------QETSLSVQLSNLGIVRSLRTKQQIQPQNKSVYIELGSDSSEDTVNKASYCSVGDQELLQITPQGARAET---------------SLNSAKK--AACEFSEKDITNTEHHQSSNKDLNTTE-KHATERHPEKYQGISVSNLHVEPCGTNTHASSLQHENSSLLLTKDRMNVEKAEFCNKSKQPGLARSQQSRWAESKETCNDRQTPSTEKKVDLNADPLYGRKELNKQKPPCSESPRD-SQDVPWITLNSSIQKVNEWFSRSDELLTSD-DSHDGGSESNAEVAGALEV-------------------PNEVDGYSGSSEKIDLLASDPHEALICKSERVHSKPVESNIEDKIFGKTYRRKASLPNLSHV--TENLIIGAFVTEPQI-TQERPLTNKLKRKRRTTSGLHPEDFIKKADLAVVQKTPEKINQGTDQMEQNGQVMNITNNGHENKTKGDYVQKEKNANPTESLEKESAFRTKAEPISSSISNMELELNIHSSKAPKKNRLRRKSSTRHIHALELVVSRNPSPPNHTELQIDSCSSSEEMKK-KNSNQMPVRHSRKLQLMEDKEPATGAKKSNKPNEQISKRLASDTFPELKLTNVPGSFTNCSSSNKLQEFVNPSLQREEKEEKLETIQVSNSTKDPKDLMLSGG-RVLQTE---RSVESTSISLVPDTDYGTQDSISLLEADTLGKAKT-APNQCVSQCAAIENPKELIHGCSKDTRNDTEGFKDPLGHEVNH-SQETSIEMEESELDTQYLQNTFKVSKRQSFALFSNPGNPEKECATVSAHSRSLRKQSPKVTLECEQKEENQGKK-ESNIKHVQAVNTTAGFPVVCQKDKKPGDYAKCS--IKGVSRLCQSSQFRGNETELITANKHGISQNPYHIPPLSPIRSSVKTKCKKNLSEEKFEEHSMSPERAVGNENIIQSTVSTISQNNIRENAFKEASSSSINEVGSST-------NEVGSSINEVGSSGENIQAELGRNRGPKLNAVLRLGLMQPEVYKQSLPVSNCKHPEIKRQGENEGVVQAVNADFSPCLISDNLEQPMGSSHASQVCSETPDDLLDD--DEIKENTSFAESDIKERSAVFSKSVQRGEFRRSPSPFTH-THLAQGHQ-RGARKLESSEENLSSEDEELPCFQHLLFGKVTNTPS-QSTRHSTVATEGLSKKTEENLLSLKNSLNDC--SNQVILAKASQEHHLSEEARCSGSLFSSQCSALEDLTANT--------NTQDPFLMFDPPSKQMRHQSESQEVL-SDKELVSDD----EERETGLEEDN-QEEQSVDSN-LGEAASGYESETSLSEDCSGLSSQSDILTTQ-------QRDTMQDNLIKLQQEMAELEAVLEQHGSQPS-NSSPSLIADSCA--PEDLLNPEQNTSEK---------------------AVLTSEKSSEYPISQNPESLSADKFQVSLDSSTSK-NKEPGMER---------------------SSPSKSQLL-DDRWYVHSHSRSLQNRNCPSQEELIKVVDVEEQQLEKS-------------EPQDLMEQSYLPRQDLEGTP-YLESGISLFSDDPESDPSEDRAPEPAHVGSMPSSTSALKLPQFQVAESAKSPAAAHTTNTAGYNAREESVSREKPELTSSTER-VNKRISMVASGLTPKEFMLVQKFARKHHITLTNLITEETTHVIMKTDAEFVCERTLKYFLGIAGGKWVVSYFWVTQSIKERKMLDEHDFEVRGDVVNGRNHQGPKRARES-----QDRKIFRGLEICCYGPFTNMPT-------------------------------------------------------------------------DQLEWMVQLCGASVVKEPSSFTLGTG---THPVVVVQPDAWTEDSGFHAIGQMCEAPVVTREWVLDSVALYQCQELDT---YLIPQIPHSHY-----------------------------------'
#define the sequence of human BRCA1
brca_human="---------------------------------------------------------------------------------------------------------------------MDLSALRVEEVQNVINAMQK---ILECPICLELIKEPVSTKCD-HIFCK---------FCMLKLLNQKKGPSQCPLCKNDITKRSLQESTRFSQLVEELLKIICAFQLD---TGLEYAN-SYNFAKK-ENNSPEHLK-DEVSIIQSMGYRNRAKRLLQSEPENPSL--------QETSLSVQLSNLGTVRTLRTKQRIQPQKTSVYIELGSDSSEDTVNKATYCSVGDQELLQITPQGTRDEI---------------SLDSAKK--AACEFSETDVTNTEHHQPSNNDLNTTE-KRAAERHPEKYQGSSVSNLHVEPCGTNTHASSLQHENSSLLLTKDRMNVEKAEFCNKSKQPGLARSQHNRWAGSKETCNDRRTPSTEKKVDLNADPLCERKEWNKQKLPCSENPRD-TEDVPWITLNSSIQKVNEWFSRSDELLGSD-DSHDGESESNAKVADVLDV-------------------LNEVDEYSGSSEKIDLLASDPHEALICKSERVHSKSVESNIEDKIFGKTYRKKASLPNLSHV--TENLIIGAFVTEPQI-IQERPLTNKLKRKRRPTSGLHPEDFIKKADLAV-QKTPEMINQGTNQTEQNGQVMNITNSGHENKTKGDSIQNEKNPNPIESLEKESAFKTKAEPISSSISNMELELNIHNSKAPKKNRLRRKSSTRHIHALELVVSRNLSPPNCTELQIDSCSSSEEIKK-KKYNQMPVRHSRNLQLMEGKEPATGAKKSNKPNEQTSKRHDSDTFPELKLTNAPGSFTKCSNTSELKEFVNPSLPREEKEEKLETVKVSNNAEDPKDLMLSGE-RVLQTE---RSVESSSISLVPGTDYGTQESISLLEVSTLGKAKT-EPNKCVSQCAAFENPKGLIHGCSKDNRNDTEGFKYPLGHEVNH-SRETSIEMEESELDAQYLQNTFKVSKRQSFAPFSNPGNAEEECATFSAHSGSLKKQSPKVTFECEQKEENQGKN-ESNIKPVQTVNITAGFPVVGQKDKP-VDNAKCS--IKGGSRFCLSSQFRGNETGLITPNKHGLLQNPYRIPPLFPIKSFVKTKCKKNLLEENFEEHSMSPEREMGNEN-IPSTVSTISRNNIRENVFKEASSSNINEVGSST-------NEVGSSINEIGSSDENIQAELGRNRGPKLNAMLRLGVLQPEVYKQSLPGSNCKHPEIKKQEY-EEVVQTVNTDFSPYLISDNLEQPMGSSHASQVCSETPDDLLDD--GEIKEDTSFAENDIKESSAVFSKSVQKGELSRSPSPFTH-THLAQGYR-RGAKKLESSEENLSSEDEELPCFQHLLFGKVNNIPS-QSTRHSTVATECLSKNTEENLLSLKNSLNDC--SNQVILAKASQEHHLSEETKCSASLFSSQCSELEDLTANT--------NTQDPFLIG--SSKQMRHQSESQGVGLSDKELVSDD----EERGTGLEENN-QEEQSMDSN-LGEAASGCESETSVSEDCSGLSSQSDILTTQ-------QRDTMQHNLIKLQQEMAELEAVLEQHGSQPS-NSYPSIISDSSA--LEDLRNPEQSTSEK---------------------AVLTSQKSSEYPISQNPEGLSADKFEVSADSSTSK-NKEPGVER---------------------SSPSKCPSL-DDRWYMHSCSGSLQNRNYPSQEELIKVVDVEEQQLEES-------------GPHDLTETSYLPRQDLEGTP-YLESGISLFSDDPESDPSEDRAPESARVGNIPSSTSALKVPQLKVAESAQSPAAAHTTDTAGYNAMEESVSREKPELTASTER-VNKRMSMVVSGLTPEEFMLVYKFARKHHITLTNLITEETTHVVMKTDAEFVCERTLKYFLGIAGGKWVVSYFWVTQSIKERKMLNEHDFEVRGDVVNGRNHQGPKRARES-----QDRKIFRGLEICCYGPFTNMPT-------------------------------------------------------------------------DQLEWMVQLCGASVVKELSSFTLGTG---VHPIVVVQPDAWTEDNGFHAIGQMCEAPVVTREWVLDSVALYQCQELDT---YLIPQIPHSHY-----------------------------------"
#check percentage of positions in the MSA that score above a certain conservation threshold
def count_positions_above_threshold(sequence_data, threshold):
amino_acids_to_plot = ['-',"A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V",'X']
# Filter out empty sequences
sequence_data = {k: v for k, v in sequence_data.items() if v}
# Calculate the maximum length of the sequences
max_length = max(len(seq) for seq in sequence_data.values())
# Create a dictionary to store the frequency of each amino acid at each position
amino_acid_frequencies = {aa: [0] * max_length for aa in amino_acids_to_plot}
# Iterate over the sequences and update the amino acid frequencies
for sequence in sequence_data.values():
for i, aa in enumerate(sequence):
amino_acid_frequencies[aa][i] += 1
# Count the number of positions above the threshold
count = 0
for i in range(max_length):
frequencies = [amino_acid_frequencies[aa][i] for aa in amino_acids_to_plot]
max_frequency = max(frequencies)
if max_frequency / sum(frequencies) > threshold:
count += 1
return count
# % of positions with more than 70% accordance
print(count_positions_above_threshold(fasta_file, 0.7)/len(consensus_sequence))
#% of positions with more than 80% accordance
print(count_positions_above_threshold(fasta_file, 0.8)/len(consensus_sequence))
#% of positions with more than 90% accordance
print(count_positions_above_threshold(fasta_file, 0.9)/len(consensus_sequence))
#% of positions with more than 95% accordance
print(count_positions_above_threshold(fasta_file, 0.95)/len(consensus_sequence))
0.8110477860587462 0.6861025865848313 0.4270056992547128 0.25208241999123193
#create sequence logo with split sequence
def create_sequence_logo(sequence_data, split_positions=None, columns=1, figsize=(16, 8), scaling_factor=0.04):
amino_acids_to_plot = ['-',"A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V",'X']
# Filter out empty sequences
sequence_data = {k: v for k, v in sequence_data.items() if v}
# Calculate the maximum length of the sequences
max_length = max(len(seq) for seq in sequence_data.values())
# Create a dictionary to store the frequency of each amino acid at each position
amino_acid_frequencies = {aa: [0] * max_length for aa in amino_acids_to_plot}
# Iterate over the sequences and update the amino acid frequencies
for sequence in sequence_data.values():
for i, aa in enumerate(sequence):
amino_acid_frequencies[aa][i] += 1
# Determine the splitting positions
if split_positions is None:
split_positions = [0, max_length]
# Adjust the number of columns if needed
num_plots = len(split_positions) - 1
rows = num_plots // columns + (num_plots % columns > 0)
# Create subplots with increased vertical gap
fig, axs = plt.subplots(rows, columns, figsize=figsize, squeeze=False, gridspec_kw={'hspace': 0.5})
axs = axs.ravel()
# Plotting the amino acid frequencies by position for each subset
term = False
for i, ax in enumerate(axs):
start = split_positions[i]
if i == num_plots-1: # Handle the last subset separately
print('end')
end = max_length
term = True
else:
end = split_positions[i + 1]
subset_data = {k: v[start:end] for k, v in sequence_data.items()}
# Plotting the amino acid frequencies by position for each amino acid
for aa in amino_acids_to_plot:
frequencies = amino_acid_frequencies[aa][start:end]
if any(frequencies):
ax.plot(range(1, len(frequencies) + 1), [f * scaling_factor for f in frequencies], label=aa)
ax.set_xlim(1, end - start + 1)
ax.set_xticks(range(1, end - start + 1))
ax.set_xticklabels(range(start + 1, end + 1))
ax.set_xlabel('Position')
ax.set_ylabel('Frequency')
ax.set_title(f"Sequence Logo - Position {start+1}-{end}")
if term:
break
# Create a single legend outside the loop
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='center', ncol=1)
# fig.legend(handles, labels, loc=(1.04, 0))
# plt.tight_layout()
plt.show()
split_positions = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, max(len(seq) for seq in fasta_file.values())] # Manually specify splitting positions
create_sequence_logo(fasta_file, split_positions=split_positions, columns=2, figsize=(50, 30), scaling_factor=0.011)
end
#create sequence logo with sequence split into domains
def create_sequence_logo(sequence_data, split_positions=None, columns=1, figsize=(16, 8), scaling_factor=0.04):
amino_acids_to_plot = ['-',"A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V",'X']
# Filter out empty sequences
sequence_data = {k: v for k, v in sequence_data.items() if v}
# Calculate the maximum length of the sequences
max_length = max(len(seq) for seq in sequence_data.values())
# Create a dictionary to store the frequency of each amino acid at each position
amino_acid_frequencies = {aa: [0] * max_length for aa in amino_acids_to_plot}
# Iterate over the sequences and update the amino acid frequencies
for sequence in sequence_data.values():
for i, aa in enumerate(sequence):
amino_acid_frequencies[aa][i] += 1
# Determine the splitting positions
if split_positions is None:
split_positions = [0, max_length]
# Adjust the number of columns if needed
num_plots = len(split_positions) - 1
rows = num_plots // columns + (num_plots % columns > 0)
# Create subplots with increased vertical gap
fig, axs = plt.subplots(rows, columns, figsize=figsize, squeeze=False, gridspec_kw={'hspace': 0.5})
axs = axs.ravel()
# Plotting the amino acid frequencies by position for each subset
term = False
for i, ax in enumerate(axs):
start = split_positions[i]
if i == num_plots-1: # Handle the last subset separately
print('end')
end = max_length
term = True
else:
end = split_positions[i + 1]
subset_data = {k: v[start:end] for k, v in sequence_data.items()}
# Plotting the amino acid frequencies by position for each amino acid
for aa in amino_acids_to_plot:
frequencies = amino_acid_frequencies[aa][start:end]
if any(frequencies):
ax.plot(range(1, len(frequencies) + 1), [f * scaling_factor for f in frequencies], label=aa)
ax.set_xlim(1, end - start + 1)
ax.set_xticks(range(1, end - start + 1))
ax.set_xticklabels(range(start + 1, end + 1))
ax.set_xlabel('Position')
ax.set_ylabel('Frequency')
ax.set_title(f"Sequence Logo - Position {start+1}-{end}")
if term:
break
# Create a single legend outside the loop
handles, labels = ax.get_legend_handles_labels()
fig.legend(handles, labels, loc='center', ncol=1)
# fig.legend(handles, labels, loc=(1.04, 0))
# plt.tight_layout()
plt.show()
split_positions = [0,352,1929, max(len(seq) for seq in fasta_file.values())] # Manually specify splitting positions
create_sequence_logo(fasta_file, split_positions=split_positions, columns=2, figsize=(60, 8), scaling_factor=0.011)
end
#create comparison between human BRCA1 and the consensus sequence
def compare_sequences(consensus_sequence, other_sequence, split_positions=None, columns=1, figsize=(16, 8)):
# Check if the sequences have the same length
if len(consensus_sequence) != len(other_sequence):
print("Sequences have different lengths.")
return
# Determine the splitting positions
if split_positions is None:
split_positions = [0, len(consensus_sequence)]
# Adjust the number of columns if needed
num_plots = len(split_positions) - 1
rows = num_plots // columns + (num_plots % columns > 0)
# Create subplots
fig, axs = plt.subplots(rows, columns, figsize=figsize, squeeze=False)
axs = axs.ravel()
# Iterate over the subsets
for i, ax in enumerate(axs):
start = split_positions[i]
end = split_positions[i + 1]
# Create a plot to visualize the sequences within the subset
for j in range(start, end):
consensus_aa = consensus_sequence[j]
other_aa = other_sequence[j]
if consensus_aa == other_aa:
ax.text(j - start, 0.5, consensus_aa, color='green', fontsize=60, ha='center')
else:
ax.text(j - start, 0.7, consensus_aa, color='green', fontsize=60, ha='center')
ax.text(j - start, 0.3, other_aa, color='red', fontsize=60, ha='center')
# Set plot properties for each subset
ax.set_xlim(-0.5, end - start - 0.5)
ax.set_ylim(0, 1)
ax.set_xticks(range(end - start))
ax.set_yticks([])
ax.set_xlabel('Position')
ax.set_title(f'Subset {i+1}')
plt.tight_layout()
plt.show()
# Example usage:
split_positions = [0, len(consensus_sequence)] # Manually specify splitting positions
compare_sequences(consensus_sequence, brca_human, split_positions=split_positions, columns=2, figsize=(200, 30))
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) Cell In[25], line 51 47 # Example usage: 49 split_positions = [0, len(consensus_sequence)] # Manually specify splitting positions ---> 51 compare_sequences(consensus_sequence, brca_human, split_positions=split_positions, columns=2, figsize=(200, 30)) Cell In[25], line 24, in compare_sequences(consensus_sequence, other_sequence, split_positions, columns, figsize) 22 for i, ax in enumerate(axs): 23 start = split_positions[i] ---> 24 end = split_positions[i + 1] 26 # Create a plot to visualize the sequences within the subset 27 for j in range(start, end): IndexError: list index out of range
#split up again
def compare_sequences(consensus_sequence, other_sequence, split_positions=None, columns=1, figsize=(16, 8)):
# Check if the sequences have the same length
if len(consensus_sequence) != len(other_sequence):
print("Sequences have different lengths.")
return
# Determine the splitting positions
if split_positions is None:
split_positions = [0, len(consensus_sequence)]
# Adjust the number of columns if needed
num_plots = len(split_positions) - 1
rows = num_plots // columns + (num_plots % columns > 0)
# Create subplots
fig, axs = plt.subplots(rows, columns, figsize=figsize, squeeze=False)
axs = axs.ravel()
# Iterate over the subsets
for i, ax in enumerate(axs):
start = split_positions[i]
end = split_positions[i + 1]
# Create a plot to visualize the sequences within the subset
for j in range(start, end):
consensus_aa = consensus_sequence[j]
other_aa = other_sequence[j]
if consensus_aa == other_aa:
ax.text(j - start, 0.5, consensus_aa, color='green', fontsize=20, ha='center')
else:
ax.text(j - start, 0.7, consensus_aa, color='green', fontsize=20, ha='center')
ax.text(j - start, 0.3, other_aa, color='red', fontsize=20, ha='center')
# Set plot properties for each subset
ax.set_xlim(-0.5, end - start - 0.5)
ax.set_ylim(0, 1)
ax.set_xticks(range(end - start))
ax.set_yticks([])
ax.set_xlabel('Position')
ax.set_title(f'Subset {i+1}')
plt.tight_layout()
plt.show()
# Example usage:
split_positions = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, len(consensus_sequence)] # Manually specify splitting positions
compare_sequences(consensus_sequence, brca_human, split_positions=split_positions, columns=2, figsize=(50, 30))
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) Cell In[27], line 51 47 # Example usage: 49 split_positions = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, len(consensus_sequence)] # Manually specify splitting positions ---> 51 compare_sequences(consensus_sequence, brca_human, split_positions=split_positions, columns=2, figsize=(50, 30)) Cell In[27], line 24, in compare_sequences(consensus_sequence, other_sequence, split_positions, columns, figsize) 22 for i, ax in enumerate(axs): 23 start = split_positions[i] ---> 24 end = split_positions[i + 1] 26 # Create a plot to visualize the sequences within the subset 27 for j in range(start, end): IndexError: list index out of range
#split up into domains again
def compare_sequences(consensus_sequence, other_sequence, split_positions=None, columns=1, figsize=(16, 8)):
# Check if the sequences have the same length
if len(consensus_sequence) != len(other_sequence):
print("Sequences have different lengths.")
return
# Determine the splitting positions
if split_positions is None:
split_positions = [0, len(consensus_sequence)]
# Adjust the number of columns if needed
num_plots = len(split_positions) - 1
rows = num_plots // columns + (num_plots % columns > 0)
# Create subplots
fig, axs = plt.subplots(rows, columns, figsize=figsize, squeeze=False)
axs = axs.ravel()
# Iterate over the subsets
for i, ax in enumerate(axs):
start = split_positions[i]
end = split_positions[i + 1]
# Create a plot to visualize the sequences within the subset
for j in range(start, end):
consensus_aa = consensus_sequence[j]
other_aa = other_sequence[j]
if consensus_aa == other_aa:
ax.text(j - start, 0.5, consensus_aa, color='green', fontsize=60, ha='center')
else:
ax.text(j - start, 0.7, consensus_aa, color='green', fontsize=60, ha='center')
ax.text(j - start, 0.3, other_aa, color='red', fontsize=60, ha='center')
# Set plot properties for each subset
ax.set_xlim(-0.5, end - start - 0.5)
ax.set_ylim(0, 1)
ax.set_xticks(range(end - start))
ax.set_yticks([])
ax.set_xlabel('Position')
ax.set_title(f'Subset {i+1}')
plt.tight_layout()
plt.show()
# Example usage:
split_positions = [0,352,1929, max(len(seq) for seq in fasta_file.values())] # Manually specify splitting positions
compare_sequences(consensus_sequence, brca_human, split_positions=split_positions, columns=2, figsize=(200, 20))
--------------------------------------------------------------------------- IndexError Traceback (most recent call last) Cell In[29], line 50 47 # Example usage: 49 split_positions = [0,352,1929, max(len(seq) for seq in fasta_file.values())] # Manually specify splitting positions ---> 50 compare_sequences(consensus_sequence, brca_human, split_positions=split_positions, columns=2, figsize=(200, 20)) Cell In[29], line 24, in compare_sequences(consensus_sequence, other_sequence, split_positions, columns, figsize) 22 for i, ax in enumerate(axs): 23 start = split_positions[i] ---> 24 end = split_positions[i + 1] 26 # Create a plot to visualize the sequences within the subset 27 for j in range(start, end): IndexError: list index out of range
data_raw = BRCA1_raw.copy()
position = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, ..., 1855, 1855, 1855, 1855, 1855]
position_df = pd.DataFrame({'position': position})
data_with_position = pd.concat([data_raw, position_df], axis=1)
data_raw['position'] = data_raw['mutant'].str.extract('(\d+)').astype(int)
print(data_raw.columns)
Index(['mutant', 'mutated_sequence', 'DMS_score', 'DMS_score_bin', 'position'], dtype='object')
test1=data_raw.mutant.str.extract(r'(\d)([ARNDCQEGHILKTMfPOUSTWYV])')
test1
| 0 | 1 | |
|---|---|---|
| 0 | 1 | I |
| 1 | 1 | V |
| 2 | 1 | T |
| 3 | 1 | R |
| 4 | 1 | L |
| ... | ... | ... |
| 1832 | 5 | R |
| 1833 | 5 | M |
| 1834 | 5 | L |
| 1835 | 5 | K |
| 1836 | 5 | T |
1837 rows × 2 columns
data_raw["new_aa"]=test1[1]
data_raw.head
<bound method NDFrame.head of mutant mutated_sequence DMS_score \
0 M1I IDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -2.502128
1 M1V VDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -2.025645
2 M1T TDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -1.656569
3 M1R RDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -2.481580
4 M1L LDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -2.516529
... ... ... ...
1832 I1855R MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.464328
1833 I1855M MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... 0.021941
1834 I1855L MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... 0.233297
1835 I1855K MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.863844
1836 I1855T MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -0.291519
DMS_score_bin position new_aa
0 0 1 I
1 0 1 V
2 0 1 T
3 0 1 R
4 0 1 L
... ... ... ...
1832 1 1855 R
1833 1 1855 M
1834 1 1855 L
1835 1 1855 K
1836 1 1855 T
[1837 rows x 6 columns]>
copy_raw = data_raw[data_raw['DMS_score_bin'] == 0].copy()
# Adding a new column 'old_aa'
copy_raw['old_aa'] = copy_raw['mutant'].str[0]
print(data_raw.head())
mutant mutated_sequence DMS_score \ 0 M1I IDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -2.502128 1 M1V VDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -2.025645 2 M1T TDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -1.656569 3 M1R RDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -2.481580 4 M1L LDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... -2.516529 DMS_score_bin position new_aa 0 0 1 I 1 0 1 V 2 0 1 T 3 0 1 R 4 0 1 L
# creating a pivot table
pivot_table = copy_raw.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')
fig, ax = plt.subplots(figsize=(50,10))
# heatmap
sns.heatmap(pivot_table, cmap='coolwarm_r')
# adding annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
for j in range(len(pivot_table.columns)):
new_aa = pivot_table.index[i]
position = pivot_table.columns[j]
old_aa = copy_raw[(copy_raw['new_aa'] == new_aa) & (copy_raw['position'] == position)]['old_aa'].values
if len(old_aa) > 0:
ax.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)
plt.xlabel('position')
plt.ylabel('new_aa')
plt.title('DMS Scores Heatmap')
plt.show()
# List of polar amino acids and filtering them out of the dataset
polar_amino_acids = ['S', 'T', 'Y', 'C', 'N', 'Q']
polar_data = copy_raw[copy_raw['new_aa'].isin(polar_amino_acids)][['new_aa', 'position', 'DMS_score', 'old_aa']]
# Creating a pivot table
pivot_table = polar_data.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')
plt.figure(figsize=(20, 6))
sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)
# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
for j in range(len(pivot_table.columns)):
new_aa = pivot_table.index[i]
position = pivot_table.columns[j]
old_aa = polar_data[(polar_data['new_aa'] == new_aa) & (polar_data['position'] == position)]['old_aa'].values
if len(old_aa) > 0:
plt.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)
plt.xlabel('Position')
plt.ylabel('new_aa polar')
plt.title('DMS Scores of new Polar Amino Acids')
plt.show()
# List of non-polar amino acids and filtering them out of the dataset
non_polar_amino_acids = ['A', 'V', 'L', 'I', 'P', 'M', 'F', 'W', 'G']
non_polar_data = copy_raw[copy_raw['new_aa'].isin(non_polar_amino_acids)][['new_aa', 'position', 'DMS_score']]
# Merge with the original dataframe to include the "old_aa" column
non_polar_data = non_polar_data.merge(copy_raw[['new_aa', 'position', 'old_aa']], on=['new_aa', 'position'])
# Creating a pivot table
pivot_table = non_polar_data.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')
plt.figure(figsize=(30, 6))
sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)
# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
for j in range(len(pivot_table.columns)):
new_aa = pivot_table.index[i]
position = pivot_table.columns[j]
old_aa = non_polar_data[(non_polar_data['new_aa'] == new_aa) & (non_polar_data['position'] == position)]['old_aa'].values
if len(old_aa) > 0:
plt.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)
plt.xlabel('Position')
plt.ylabel('new_aa non-polar')
plt.title('DMS Scores of new Non-Polar Amino Acids')
plt.show()
# List of positively charged amino acids and filtering them out of the dataset
positively_charged_amino_acids = ['R', 'H', 'K']
positively_charged_data = copy_raw[copy_raw['new_aa'].isin(positively_charged_amino_acids)][['new_aa', 'position', 'DMS_score', 'old_aa']]
# Creating a pivot table
pivot_table = positively_charged_data.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')
plt.figure(figsize=(30, 6))
sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)
# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
for j in range(len(pivot_table.columns)):
new_aa = pivot_table.index[i]
position = pivot_table.columns[j]
old_aa = positively_charged_data[(positively_charged_data['new_aa'] == new_aa) & (positively_charged_data['position'] == position)]['old_aa'].values
if len(old_aa) > 0:
plt.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)
plt.xlabel('Position')
plt.ylabel('new_aa positive charge')
plt.title('DMS Scores of new Positively Charged Amino Acids')
plt.show()
# List of negatively charged amino acids and filtering them out of the dataset
negatively_charged_amino_acids = ['D', 'E']
negatively_charged_data = copy_raw[copy_raw['new_aa'].isin(negatively_charged_amino_acids)][['new_aa', 'position', 'DMS_score', 'old_aa']]
# Creating a pivot table
pivot_table = negatively_charged_data.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')
plt.figure(figsize=(20, 6))
sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)
# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
for j in range(len(pivot_table.columns)):
new_aa = pivot_table.index[i]
position = pivot_table.columns[j]
old_aa = negatively_charged_data[(negatively_charged_data['new_aa'] == new_aa) & (negatively_charged_data['position'] == position)]['old_aa'].values
if len(old_aa) > 0:
plt.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)
plt.xlabel('Position')
plt.ylabel('new_aa negative charge')
plt.title('DMS Scores of new Negatively Charged Amino Acids')
plt.show()
# List of essential amino acids and filtering them out of the dataset
essential_amino_acids = ['H','I','L','K','M','F','T','W','V']
essential_data = copy_raw[copy_raw['old_aa'].isin(essential_amino_acids)][['old_aa', 'position', 'DMS_score', 'new_aa']]
# Creating a pivot table
pivot_table = essential_data.pivot_table(index='old_aa', columns='position', values='DMS_score', aggfunc='mean')
plt.figure(figsize=(30, 6))
sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)
# Add annotations for the "new_aa" values
for i in range(len(pivot_table.index)):
for j in range(len(pivot_table.columns)):
old_aa = pivot_table.index[i]
position = pivot_table.columns[j]
new_aa = essential_data[(essential_data['old_aa'] == old_aa) & (essential_data['position'] == position)]['new_aa'].values
if len(new_aa) > 0:
plt.text(j + 0.5, i + 0.5, new_aa[0], ha='center', va='center', fontsize=8)
plt.xlabel('Position')
plt.ylabel('old_aa essential')
plt.title('DMS Scores of Essential Amino Acids')
plt.show()
# List of essential amino acids and filtering them out of the dataset
essential_amino_acids = ['H', 'I', 'L', 'K', 'M', 'F', 'T', 'W', 'V']
essential_data = copy_raw[copy_raw['new_aa'].isin(essential_amino_acids)][['new_aa', 'position', 'DMS_score', 'old_aa']]
# Creating a pivot table
pivot_table = essential_data.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')
plt.figure(figsize=(30, 6))
sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)
# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
for j in range(len(pivot_table.columns)):
new_aa = pivot_table.index[i]
position = pivot_table.columns[j]
old_aa = essential_data[(essential_data['new_aa'] == new_aa) & (essential_data['position'] == position)]['old_aa'].values
if len(old_aa) > 0:
plt.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)
plt.xlabel('Position')
plt.ylabel('new_aa essential')
plt.title('DMS Scores of Essential Amino Acids')
plt.show()
# List of non-essential amino acids and filtering them out of the dataset
non_essential_amino_acids = ['A', 'D', 'N', 'E', 'S', 'U', 'O']
non_essential_data = copy_raw[~copy_raw['old_aa'].isin(non_essential_amino_acids)][['old_aa', 'position', 'DMS_score', 'new_aa']]
# Creating a pivot table
pivot_table = non_essential_data.pivot_table(index='old_aa', columns='position', values='DMS_score', aggfunc='mean')
plt.figure(figsize=(20, 6))
sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)
# Add annotations for the "new_aa" values
for i in range(len(pivot_table.index)):
for j in range(len(pivot_table.columns)):
old_aa = pivot_table.index[i]
position = pivot_table.columns[j]
new_aa = non_essential_data[(non_essential_data['old_aa'] == old_aa) & (non_essential_data['position'] == position)]['new_aa'].values
if len(new_aa) > 0:
plt.text(j + 0.5, i + 0.5, new_aa[0], ha='center', va='center', fontsize=8)
plt.xlabel('Position')
plt.ylabel('old_aa non-essential')
plt.title('DMS Scores of Non-Essential Amino Acids')
plt.show()
# List of non-essential amino acids and filtering them out of the dataset
non_essential_amino_acids = ['A', 'D', 'N', 'E', 'S', 'U', 'O']
non_essential_data = copy_raw[~copy_raw['new_aa'].isin(non_essential_amino_acids)][['new_aa', 'position', 'DMS_score', 'old_aa']]
# Creating a pivot table
pivot_table = non_essential_data.pivot_table(index='new_aa', columns='position', values='DMS_score', aggfunc='mean')
plt.figure(figsize=(20, 6))
sns.heatmap(pivot_table, cmap='coolwarm_r', annot=False, linewidths=0.5)
# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
for j in range(len(pivot_table.columns)):
new_aa = pivot_table.index[i]
position = pivot_table.columns[j]
old_aa = non_essential_data[(non_essential_data['new_aa'] == new_aa) & (non_essential_data['position'] == position)]['old_aa'].values
if len(old_aa) > 0:
plt.text(j + 0.5, i + 0.5, old_aa[0], ha='center', va='center', fontsize=8)
plt.xlabel('Position')
plt.ylabel('new_aa non-essential')
plt.title('DMS Scores of Non-Essential Amino Acids')
plt.show()
copy_phosphorylation = copy_raw.loc[copy_raw['old_aa'].isin(['S', 'T', 'Y'])].copy()
# Filtering the dataset for rows excluding 'S', 'T', or 'Y' in the "new_aa" column
filtered_data = copy_phosphorylation.loc[~copy_phosphorylation['new_aa'].isin(['S', 'T', 'Y'])]
# Reshaping the filtered data for heatmap visualization
heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")
# Creating a separate dataframe to store the "old_aa" values
old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")
# the heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data, cmap="coolwarm_r", annot=False, cbar=True)
plt.title("DMS_score phosphorylation")
plt.xlabel("Position")
plt.ylabel("new_aa")
# Add annotations for the "old_aa" values
for i in range(len(heatmap_data.index)):
for j in range(len(heatmap_data.columns)):
plt.text(j + 0.5, i + 0.5, old_aa_data.iloc[i, j], ha='center', va='center', fontsize=8)
plt.show()
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/908054992.py:5: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/908054992.py:8: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")
copy_new_phosphorylation = copy_raw[copy_raw['new_aa'].isin(['S', 'T', 'Y'])].copy()
# Filtering the dataset for rows that have any value except 'S', 'T', or 'Y' in the "old_aa" column
filtered_data = copy_new_phosphorylation[~copy_new_phosphorylation['old_aa'].isin(['S', 'T', 'Y'])]
# Reshaping the filtered data for heatmap visualization
heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")
# Creating a separate dataframe to store the "old_aa" values
old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")
# The heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data, cmap="coolwarm_r", annot=old_aa_data, fmt="", cbar=True)
plt.title("DMS_score phosphorylation")
plt.xlabel("Position")
plt.ylabel("new_aa")
plt.show()
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/2442374556.py:5: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/2442374556.py:8: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")
copy_new_lys = copy_raw[copy_raw['new_aa'].isin(['K'])].copy()
# Filtering the dataset for rows that have 'K' in the "new_aa" column
filtered_data = copy_new_lys[copy_new_lys['new_aa'] == 'K']
# Reshaping the filtered data for heatmap visualization
heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")
# Creating a separate dataframe to store the "old_aa" values
old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")
# The heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data, cmap="coolwarm_r", annot=old_aa_data, fmt="", cbar=True)
plt.title("DMS_score acetylation")
plt.xlabel("Position")
plt.ylabel("new_aa")
plt.show()
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/1979893177.py:5: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/1979893177.py:8: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")
# Creating a copy of the dataset with rows that have 'K' in the "old_aa" column
copy_old_lys = copy_raw[copy_raw['old_aa'].str.contains('K')].copy()
# Filtering the dataset for rows that have 'K' in the "old_aa" column
filtered_data = copy_old_lys[copy_old_lys['old_aa'] == 'K']
# Reshaping the filtered data for heatmap visualization
heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")
# Creating a separate dataframe to store the "old_aa" values
old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")
# The heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data, cmap="coolwarm_r", annot=old_aa_data, fmt="", cbar=True)
plt.title("DMS_score acetylationg")
plt.xlabel("Position")
plt.ylabel("new_aa")
plt.show()
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/3347817003.py:5: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
heatmap_data = filtered_data.pivot("new_aa", "position", "DMS_score")
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/3347817003.py:8: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
old_aa_data = filtered_data.pivot("new_aa", "position", "old_aa")
copy_old_glyc = copy_raw[copy_raw['old_aa'].isin(['S', 'T', 'N'])].copy()
# Creating a pivot table for DMS scores
pivot_table = copy_old_glyc.pivot('new_aa', 'position', 'DMS_score')
# The heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(pivot_table, cmap='coolwarm_r', linewidths=0.5, linecolor='lightgray')
# Add annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
for j in range(len(pivot_table.columns)):
mask = (copy_old_glyc['new_aa'] == pivot_table.index[i]) & (copy_old_glyc['position'] == pivot_table.columns[j])
if mask.any():
plt.text(j + 0.5, i + 0.5, copy_old_glyc.loc[mask, 'old_aa'].values[0],
ha='center', va='center', fontsize=8)
plt.title('DMS glycolysation')
plt.xlabel('Position')
plt.ylabel('new_aa')
plt.show()
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/1995355757.py:2: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
pivot_table = copy_old_glyc.pivot('new_aa', 'position', 'DMS_score')
filtered_data = copy_old_glyc[~copy_old_glyc['old_aa'].isin(['S', 'T', 'N'])]
copy_new_glyc = copy_raw[copy_raw['new_aa'].isin(['S', 'T', 'N'])].copy()
# Creating a pivot table for DMS scores
pivot_table = copy_new_glyc.pivot('new_aa', 'position', 'DMS_score')
# The heatmap
plt.figure(figsize=(23, 10))
sns.heatmap(pivot_table, cmap='coolwarm_r', linewidths=0.5, linecolor='lightgray')
# Adding annotations for the "old_aa" values
for i in range(len(pivot_table.index)):
for j in range(len(pivot_table.columns)):
mask = (copy_new_glyc['new_aa'] == pivot_table.index[i]) & (copy_new_glyc['position'] == pivot_table.columns[j])
if mask.any():
plt.text(j + 0.5, i + 0.5, copy_new_glyc.loc[mask, 'old_aa'].values[0],
ha='center', va='center', fontsize=8)
plt.title('DMS glycolysation')
plt.xlabel('Position')
plt.ylabel('new_aa')
plt.show()
/var/folders/px/t7c4vv9555gc6_5d200h7fmr0000gn/T/ipykernel_82123/437836168.py:2: FutureWarning: In a future version of pandas all arguments of DataFrame.pivot will be keyword-only.
pivot_table = copy_new_glyc.pivot('new_aa', 'position', 'DMS_score')
data_raw
| mutant | mutated_sequence | DMS_score | DMS_score_bin | position | new_aa | |
|---|---|---|---|---|---|---|
| 0 | M1I | IDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -2.502128 | 0 | 1 | I |
| 1 | M1V | VDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -2.025645 | 0 | 1 | V |
| 2 | M1T | TDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -1.656569 | 0 | 1 | T |
| 3 | M1R | RDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -2.481580 | 0 | 1 | R |
| 4 | M1L | LDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -2.516529 | 0 | 1 | L |
| ... | ... | ... | ... | ... | ... | ... |
| 1832 | I1855R | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.464328 | 1 | 1855 | R |
| 1833 | I1855M | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | 0.021941 | 1 | 1855 | M |
| 1834 | I1855L | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | 0.233297 | 1 | 1855 | L |
| 1835 | I1855K | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.863844 | 1 | 1855 | K |
| 1836 | I1855T | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.291519 | 1 | 1855 | T |
1837 rows × 6 columns
data_fit = df[df['DMS_score_bin'] == 1].copy()
data_fit = copy_raw[copy_raw['DMS_score_bin'] == 1][['mutant', 'mutated_sequence', 'DMS_score', 'DMS_score_bin', 'position', 'new_aa', 'old_aa']].copy()
data_fit['old_aa'] = data_fit['mutant'].str[0]
data_fit
| mutant | mutated_sequence | DMS_score | DMS_score_bin | position | new_aa | old_aa | |
|---|---|---|---|---|---|---|---|
| 6 | D2N | MNLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.682954 | 1 | 2 | N | D |
| 7 | D2A | MALSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.180782 | 1 | 2 | A | D |
| 8 | D2E | MELSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | 0.088696 | 1 | 2 | E | D |
| 10 | D2Y | MYLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.256095 | 1 | 2 | Y | D |
| 13 | L3F | MDFSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.341923 | 1 | 3 | NaN | L |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1832 | I1855R | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.464328 | 1 | 1855 | R | I |
| 1833 | I1855M | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | 0.021941 | 1 | 1855 | M | I |
| 1834 | I1855L | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | 0.233297 | 1 | 1855 | L | I |
| 1835 | I1855K | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.863844 | 1 | 1855 | K | I |
| 1836 | I1855T | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.291519 | 1 | 1855 | T | I |
1363 rows × 7 columns
fit_old_glyc = data_fit[data_fit['old_aa'].isin(['S', 'T', 'N'])].copy()
fit_old_glyc
| mutant | mutated_sequence | DMS_score | DMS_score_bin | position | new_aa | old_aa | |
|---|---|---|---|---|---|---|---|
| 17 | S4Y | MDLYALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.255390 | 1 | 4 | Y | S |
| 18 | S4A | MDLAALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | 0.073691 | 1 | 4 | A | S |
| 19 | S4P | MDLPALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.330096 | 1 | 4 | P | S |
| 20 | S4F | MDLFALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.495239 | 1 | 4 | NaN | S |
| 21 | S4C | MDLCALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.245299 | 1 | 4 | C | S |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1815 | T1852N | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.074500 | 1 | 1852 | N | T |
| 1816 | T1852P | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.067412 | 1 | 1852 | P | T |
| 1817 | T1852S | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.131134 | 1 | 1852 | S | T |
| 1818 | T1852A | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | 0.309628 | 1 | 1852 | A | T |
| 1819 | T1852I | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKF... | -0.152108 | 1 | 1852 | I | T |
236 rows × 7 columns
# Creating the pivot table for the heatmap
heatmap_data = fit_old_glyc.pivot(index='new_aa', columns='position', values='DMS_score')
# The heatmap
plt.figure(figsize=(10, 8))
ax = sns.heatmap(heatmap_data, annot=False, cmap='coolwarm_r', yticklabels=True, xticklabels=True)
# Add annotations for old_aa values
for i in range(len(heatmap_data.index)):
for j in range(len(heatmap_data.columns)):
new_aa = heatmap_data.index[i]
position = heatmap_data.columns[j]
old_aa = fit_old_glyc.loc[(fit_old_glyc['new_aa'] == new_aa) & (fit_old_glyc['position'] == position), 'old_aa']
if not old_aa.empty:
ax.text(j + 0.5, i + 0.5, old_aa.values[0], ha='center', va='center', color='black')
plt.xlabel('Position')
plt.ylabel('new_aa')
plt.title('fit DMS glycolysation')
plt.show()
fit_new_glyc = data_fit[data_fit['new_aa'].isin(['S', 'T', 'N'])].copy()
# Creating the pivot table for the heatmap
heatmap_data = fit_new_glyc.pivot(index='new_aa', columns='position', values='DMS_score')
# The heatmap
plt.figure(figsize=(40, 8))
ax = sns.heatmap(heatmap_data, annot=False, cmap='coolwarm_r', yticklabels=True, xticklabels=True)
# Add annotations for old_aa values
for i in range(len(heatmap_data.index)):
for j in range(len(heatmap_data.columns)):
new_aa = heatmap_data.index[i]
position = heatmap_data.columns[j]
old_aa = fit_new_glyc.loc[(fit_new_glyc['new_aa'] == new_aa) & (fit_new_glyc['position'] == position), 'old_aa']
if not old_aa.empty:
ax.text(j + 0.5, i + 0.5, old_aa.values[0], ha='center', va='center', color='black')
plt.xlabel('Position')
plt.ylabel('new_aa')
plt.title('fit DMS glycolysation')
plt.show()
fit_old_acet = data_fit[data_fit['old_aa'].isin(['K'])].copy()
# Creating the pivot table for the heatmap
heatmap_data = fit_old_acet.pivot(index='new_aa', columns='position', values='DMS_score')
# The heatmap
plt.figure(figsize=(10, 8))
ax = sns.heatmap(heatmap_data, annot=False, cmap='coolwarm_r', yticklabels=True, xticklabels=True)
# Add annotations for old_aa values
for i in range(len(heatmap_data.index)):
for j in range(len(heatmap_data.columns)):
new_aa = heatmap_data.index[i]
position = heatmap_data.columns[j]
old_aa = fit_old_acet.loc[(fit_old_acet['new_aa'] == new_aa) & (fit_old_acet['position'] == position), 'old_aa']
if not old_aa.empty:
ax.text(j + 0.5, i + 0.5, old_aa.values[0], ha='center', va='center', color='black')
plt.xlabel('Position')
plt.ylabel('new_aa')
plt.title('fit DMS acetylation')
plt.show()
fit_new_acet = data_fit[data_fit['new_aa'].isin(['K'])].copy()
# Creating the pivot table for the heatmap
heatmap_data = fit_new_acet.pivot(index='new_aa', columns='position', values='DMS_score')
# The heatmap
plt.figure(figsize=(40, 8))
ax = sns.heatmap(heatmap_data, annot=False, cmap='coolwarm_r', yticklabels=True, xticklabels=True)
# Add annotations for old_aa values
for i in range(len(heatmap_data.index)):
for j in range(len(heatmap_data.columns)):
new_aa = heatmap_data.index[i]
position = heatmap_data.columns[j]
old_aa = fit_new_acet.loc[(fit_new_acet['new_aa'] == new_aa) & (fit_new_acet['position'] == position), 'old_aa']
if not old_aa.empty:
ax.text(j + 0.5, i + 0.5, old_aa.values[0], ha='center', va='center', color='black')
plt.xlabel('Position')
plt.ylabel('new_aa')
plt.title('fit DMS acetylation')
plt.show()
fit_old_phos = data_fit[data_fit['old_aa'].isin(['S', 'T', 'Y'])].copy()
# Creating the pivot table for the heatmap
heatmap_data = fit_old_phos.pivot(index='new_aa', columns='position', values='DMS_score')
# The heatmap
plt.figure(figsize=(10, 8))
ax = sns.heatmap(heatmap_data, annot=False, cmap='coolwarm_r', yticklabels=True, xticklabels=True)
# Add annotations for old_aa values
for i in range(len(heatmap_data.index)):
for j in range(len(heatmap_data.columns)):
new_aa = heatmap_data.index[i]
position = heatmap_data.columns[j]
old_aa = fit_old_phos.loc[(fit_old_phos['new_aa'] == new_aa) & (fit_old_phos['position'] == position), 'old_aa']
if not old_aa.empty:
ax.text(j + 0.5, i + 0.5, old_aa.values[0], ha='center', va='center', color='black')
plt.xlabel('Position')
plt.ylabel('new_aa')
plt.title('fit phosphorylation')
plt.show()
fit_new_phos = data_fit[data_fit['new_aa'].isin(['S', 'T', 'Y'])].copy()
# Creating the pivot table for the heatmap
heatmap_data = fit_new_phos.pivot(index='new_aa', columns='position', values='DMS_score')
# The heatmap
plt.figure(figsize=(40, 8))
ax = sns.heatmap(heatmap_data, annot=False, cmap='coolwarm_r', yticklabels=True, xticklabels=True)
# Add annotations for old_aa values
for i in range(len(heatmap_data.index)):
for j in range(len(heatmap_data.columns)):
new_aa = heatmap_data.index[i]
position = heatmap_data.columns[j]
old_aa = fit_new_phos.loc[(fit_new_phos['new_aa'] == new_aa) & (fit_new_phos['position'] == position), 'old_aa']
if not old_aa.empty:
ax.text(j + 0.5, i + 0.5, old_aa.values[0], ha='center', va='center', color='black')
plt.xlabel('Position')
plt.ylabel('new_aa')
plt.title('fit phosphorylation')
plt.show()